Field sampling design - IMOS GBR-MGD initiative

Seawater samples were collected for (1) microbial metagenomics and (2) physico-chemical data (temperature, salinity, and particulate/dissolved nutrient concentrations) from 48 offshore reefs across the length of the GBR, within the Great Barrier Reef Microbial Genomics Database (GBR-MGD) initiative by Australia’s Integrated Marine Observing System (IMOS). This sampling was done alongside ongoing in situ health surveys by the Australian Institute of Marine Science Long-Term Monitoring Program (AIMS-LTMP).

The code below was used to create a map showing the 48 IMOS GBR-MGD sites, by combining these two tutorials: 1. https://open-aims.github.io/gisaimsr/articles/examples.html 2. https://r-spatial.org/r/2018/10/25/ggplot2-sf-2.html

# Importing the coordinates
map_coords <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Tom_Jenkins_script/all_IMOS-MGD_seawater_subset/Metadata_files/MARKO_for_eReefs_Lats_Longs.csv")
# View(map_coords)

# Now converting from data frame into sf format
map_coords <- st_as_sf(map_coords, 
                       coords = c("lon", "lat"), 
                       remove = FALSE, 
                       crs = 4283, # this is the reference code for the CRS system GDA94, used by dataaimsr & gisaimsr R packages
                       agr = "constant")

# I will now add the info on Sampling trip - this will be needed when plotting

# This is the final metadata file, with average values of env. measurements (averaged per Reef site)
map_reef_names_and_trip <- read.csv(file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/00_FINAL_PIPELINE/01_Preparation_of_Environmental_Metadata/metadata_with_reef_names_maps.csv") %>% 
  dplyr::select(REEF_NAME, Sampling_trip)

### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
map_reef_names_and_trip$Sampling_trip <- gsub("First", # String to search for
                               "Trip_01_Nov-Dec_2019", # Replace with this
                               as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in

# Second trip
map_reef_names_and_trip$Sampling_trip <- gsub("Second", # String to search for
                               "Trip_02_January_2020", # Replace with this
                               as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in

# Third trip
map_reef_names_and_trip$Sampling_trip <- gsub("Third", # String to search for
                               "Trip_03_February_2020", # Replace with this
                               as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in

# Fourth trip
map_reef_names_and_trip$Sampling_trip <- gsub("Fourth", # String to search for
                               "Trip_04_July_2020", # Replace with this
                               as.character(map_reef_names_and_trip$Sampling_trip)) # Column to search in

# Merging with 'Sampling trip' info
map_coords <- left_join(map_coords, map_reef_names_and_trip, by = c("name" = "REEF_NAME"))

#########################
# Now plotting

# And now defining colors for the map
cols_map <- c("tomato3", # Enclosed Coastal 
              "salmon3", # Macro Tidal Enclosed Coastal
              "pink3", # Macro Tidal Open Coastal
              "peachpuff", # Midshelf
              "lightsteelblue", # Offshore
              "lightcoral") # Open coastal

# Plotting without the mainland - otherwise I am just losing precious space
gbr_no_mainland <- gbr_feat %>%
  dplyr::filter(FEAT_NAME != "Mainland")

# ------------------------------------------------ #
# Overlaying IMOS-MGD sites - only as points first #
# ------------------------------------------------ #

col.per.trip <- factor(map_coords$Sampling_trip, levels = c("Trip_01_Nov-Dec_2019",
                                                            "Trip_02_January_2020",
                                                            "Trip_03_February_2020",
                                                            "Trip_04_July_2020"))
colors <- c("indianred", # Sampling trip 1
            "indianred4", # Sampling trip 2 
            "red3", # Sampling trip 3
            "slateblue") # Sampling trip 4
names(colors) <- c("Trip_01_Nov-Dec_2019",
                   "Trip_02_January_2020",
                   "Trip_03_February_2020",
                   "Trip_04_July_2020")

# Importing city coordinates
oz_cities <- read.csv("~/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Tom_Jenkins_script/all_IMOS-MGD_seawater_subset/Input_files/oz_cities.csv")

# Plot
IMOS_MGD_dots_trip = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
  #  geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
# Include the region info too (the 3 lines below)
#    geom_sf(data = nrm_regions,
#  mapping = aes(fill = NAME), lwd = 0.01) +
#  scale_fill_brewer(name = "Region", palette = "Set3") +
  #  geom_sf(data = wbodies,
  #          mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
  #          lwd = 0.01) +
  geom_sf(data = gbr_feat, lwd = 0.01,
          fill = "seashell2"
  ) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords, # Needs to be a data frame, requires 'geometry'
        aes(color = Sampling_trip), # Coloring sites per sampling trip
        #        alpha = 0.6, # This is to ensure 
        show.legend = "point") +
  coord_sf(xlim = c(142, 154), ylim = c(-10, -27)) +
  geom_text_repel(data = oz_cities, aes(x = Longitude, y = Latitude, label = Town), # repel to make sure the names do not overlap
                  fontface = "bold", # to have the reef names in bold 
                  size=3.2, 
                  col = 'black',
                  nudge_x = c(-0.5, # Townsville
                              -0.9, # Brisbane
                              -0.5, # Cairns
                              -0.8, # Cooktown
                              -0.5, # Mackay
                              -0.7), # Bundaberg
                  nudge_y = c(-0.5, # Townsville
                              0.5, # Brisbane
                              -0.5, # Cairns
                              -0.5, # Cooktown
                              -0.5, # Mackay
                              -0.5), # Bundaberg
  )+ 
  scale_color_manual(name = "Dates of Sampling Transects", values=colors)+
  theme_classic() +
  theme(panel.background = element_rect(fill = "lightblue3",
                                        colour = "lightblue3",
                                        size = 0.5, linetype = "solid")) +
  labs(x = "Longitude",
       y = "Latitude",
       title = "IMOS-MGD",
       subtitle = "Microbial Genomics Database sites") +
#  scale_fill_manual(name = "Type of Water Body", values = cols_map) +
  theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_dots_trip
Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (bellow) A more detailed map showing the name of each reef site, and their membership to either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.

Field sampling design for the GBR-MGD (Great Barrier Reef Microbial Genomics Database) dataset. (above) Seawater was collected from 48 offshore GBR reef sites for microbial community metagenomic sequencing and water chemistry analysis over four trips between November 2019 and July 2020. Reef sites are coloured in red or blue tones to denote trips that occurred during the austral summer (wet season) or austral winter (dry season), respectively. (bellow) A more detailed map showing the name of each reef site, and their membership to either offshore (41 reefs) or mid-shelf (7 reefs) waters. No inshore sites were sampled.

To show reefs in more detail, we also plot a close-up of sites within each trip.

# But need to split map_coords file per trip
map_coords_trip1 <- filter(map_coords, Sampling_trip=="Trip_01_Nov-Dec_2019")
map_coords_trip2 <- filter(map_coords, Sampling_trip=="Trip_02_January_2020")
map_coords_trip3 <- filter(map_coords, Sampling_trip=="Trip_03_February_2020")
map_coords_trip4 <- filter(map_coords, Sampling_trip=="Trip_04_July_2020")
IMOS_MGD_trip1 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
  #  geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
  geom_sf() +
  #  geom_sf(data = wbodies,
  #          mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
  #          lwd = 0.01) +
  geom_sf(data = gbr_feat,
          lwd = 0.01,
          fill = "seashell2"
  ) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip1, # Needs to be a data frame, requires 'geometry'
        aes(color = Sampling_trip), # Coloring sites per sampling trip
        show.legend = "point") +
  geom_text_repel(data = map_coords_trip1, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
                  fontface = "bold", # to have the reef names in bold 
                  size=3.2, 
                  segment.color = "black",
                  segment.alpha = 0.6,
                  segment.size = 0.1,
                  nudge_x = c(1.2, # MCSWEENEY REEF 
                              1.2, # MONSOON REEF, - sign means it will move to the left 
                              1.2, # 11-049
                              1.2, # 11-162
                              0.9, # MANTIS REEF
                              0.6, # LAGOON REEF
                              0.4, # DAVIE REEF
                              -0.2, # CORBETT REEF
                              0.4, # 13-124
                              -0.1), # SANBANK 1 REEF
                  # This should be 48 times, for our 48 sites
                  nudge_y = c(0.2, # MCSWEENEY REEF 
                              0.1, # MONSOON REEF 
                              0.1, # 11-049, - sign means it will go down
                              0.1, # 11-162
                              0.2, # MANTIS REEF
                              -0.3, # LAGOON REEF
                              0.2, # DAVIE REEF
                              -0.8, # CORBETT REEF
                              0.3, # 13-124
                              -0.6), ) + # SANDBANK 1 REEF
  coord_sf(xlim = c(143, 147), ylim = c(-11, -16.5)) +
  scale_color_manual(name = "Sampling trip", values=c("indianred")) + # color I am using for Sampling trip 1
  theme_classic() +
  theme(panel.background = element_rect(fill = "lightblue3",
                                        colour = "lightblue3",
                                        size = 0.5, linetype = "solid")) +
  labs(x = "Longitude",
       y = "Latitude",
       title = "IMOS Microbial Genomics Database sites",
       subtitle = "Trip 1 (Nov-Dec 2019)")
  #  scale_fill_manual(name = "Type of Water Body", values = cols_map) +
#  theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip1

IMOS_MGD_trip2 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
  #  geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
  geom_sf() +
  #  geom_sf(data = wbodies,
  #          mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
  #          lwd = 0.01) +
  geom_sf(data = gbr_feat,
          lwd = 0.01,
          fill = "seashell2"
  ) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip2, # Needs to be a data frame, requires 'geometry'
        aes(color = Sampling_trip), # Coloring sites per sampling trip
        show.legend = "point") +
  geom_text_repel(data = map_coords_trip2, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
                  fontface = "bold", # to have the reef names in bold 
                  size=3.2, 
                  segment.color = "black",
                  segment.alpha = 0.6,
                  segment.size = 0.1,
                  nudge_x = c(-0.1, # FAIRFAX REEF
                    -0.4, # HOSKYN REEF
                    0.3, # BOULT REEF
                    0.3, # MASTHEAD REEF
                    -0.2, # ERSKINE REEF
                    0.4, # BROOMFIELD REEF
                    0.1, # 21-550
                    -0.5, # 22-084
                    0.4, # CHINAMAN REEF
                    -0.1, # 21-580
                    0.2, # SMALL LAGOON REEF
                    -0.3), # NORTH REEF
                  # This should be 48 times, for our 48 sites
                  nudge_y = c(-0.1, # FAIRFAX REEF
                    -0.1, # HOSKYN REEF
                    0.2, # BOULT REEF
                    -0.1, # MASTHEAD REEF
                    0.2, # ERSKINE REEF
                    0.2, # BROOMFIELD REEF
                    0.2, # 21-550
                    -0.3, # 22-084
                    0.2, # CHINAMAN REEF
                    -0.6, # 21-580
                    0.4, # SMALL LAGOON REEF
                    0.1), # NORTH REEF
                  ) + # SANDBANK 1 REEF
  coord_sf(xlim = c(151, 153), ylim = c(-21.5, -24)) +
  scale_color_manual(name = "Sampling trip", values=c("indianred4")) + # color I am using for Sampling trip 1
  theme_classic() +
  theme(panel.background = element_rect(fill = "lightblue3",
                                        colour = "lightblue3",
                                        size = 0.5, linetype = "solid")) +
  labs(x = "Longitude",
       y = "Latitude",
       title = "IMOS Microbial Genomics Database sites",
       subtitle = "Trip 2 (January 2020)")
  #  scale_fill_manual(name = "Type of Water Body", values = cols_map) +
#  theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip2

IMOS_MGD_trip3 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
  #  geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
  geom_sf() +
  #  geom_sf(data = wbodies,
  #          mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
  #          lwd = 0.01) +
  geom_sf(data = gbr_feat,
          lwd = 0.01,
          fill = "seashell2"
  ) +
##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip3, # Needs to be a data frame, requires 'geometry'
        aes(color = Sampling_trip), # Coloring sites per sampling trip
        show.legend = "point") +
  geom_text_repel(data = map_coords_trip3, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
                  fontface = "bold", # to have the reef names in bold 
                  size=3.2, 
                  segment.color = "black",
                  segment.alpha = 0.6,
                  segment.size = 0.1,
                  nudge_x = c(#0.3, # ST CRISPIN
                              0.3, # AGINCOURT1 REEF
                              0.3, # HASTINGS REEF
                              0.3, # ARLINGTON REEF
                              0.4, # THETFORD REEF
                              0.4, # MOORE REEF
                              0.3, # HEDLEY REEF
                              0.3, # MCCULLOCH REEF
                              0.4, # PEART REEF
                              0.4, # FEATHER REEF
                              0.1, # FARQUAHARSON REEF
                              0.3), # TAYLOR REEF
                  # This should be 48 times, for our 48 sites
                  nudge_y = c(#-0.1, # ST CRISPIN
                              0.2, # AGINCOURT1 REEF
                              0.2, # HASTINGS REEF
                              0.1, # ARLINGTON REEF
                              0.2, # THETFORD REEF
                              0.1, # MOORE REEF
                              0.1, # HEDLEY REEF
                              0.2, # MCCULLOCH REEF
                              0.1, # PEART REEF
                              -0.1, # FEATHER REEF
                              0.2, # FARQUAHARSON REEF
                              -0.1), # TAYLOR REEF
  ) +
  coord_sf(xlim = c(145.4, 147), ylim = c(-15.8, -18)) +
  scale_color_manual(name = "Sampling trip", values=c("red"))+
  theme_classic() +
  theme(panel.background = element_rect(fill = "lightblue3",
                                        colour = "lightblue3",
                                        size = 0.5,
                                        linetype = "solid")) +
  labs(x = "Longitude",
       y = "Latitude",
       title = "IMOS Microbial Genomics Database sites",
       subtitle = "Trip 3 (February 2020)") +
  #  scale_fill_manual(name = "Type of Water Body", values = cols_map) +
  theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip3

IMOS_MGD_trip4 = ggplot(data = gbr_feat) + # gbr_feat is a crs object from the AIMS GIS packages
  #  geom_sf(data = gbr_bounds, fill = "darkred", colour = NA) + # Coloring the boundaries of the GBR MP
  geom_sf() +
  #  geom_sf(data = wbodies,
  #          mapping = aes(fill = MarineWate), # Commenting out as I don't need water bodies in color here
  #          lwd = 0.01) +
  geom_sf(data = gbr_feat,
          lwd = 0.01,
          fill = "seashell2"
  ) +
  ##############################################################################################
# Adding our sites as dots
##############################################################################################
geom_sf(data = map_coords_trip4, # Needs to be a data frame, requires 'geometry'
        aes(color = Sampling_trip), # Coloring sites per sampling trip
        show.legend = "point") +
  geom_text_repel(data = map_coords_trip4, aes(x = lon, y = lat, label = name, colour = Sampling_trip), # repel to make sure the names do not overlap
                  fontface = "bold", # to have the reef names in bold 
                  size=3.2, 
                  segment.color = "black",
                  segment.alpha = 0.6,
                  segment.size = 0.1,
                  nudge_x = c(-0.2, # LITTLE KELSO REEF
                              -0.2, # KELSO REEF
                              0.5, # ROXBURGH REEF
                              0.2, # FORE&AFT REEF
                              0.2, # RIB REEF
                              -0.1, # JOHN BREWER REEF
                              0.1, # MYRMIDON REEF
                              -0.3, # CHICKEN REEF
                              0.3, # KNIFE REEF
                              0.2, # FORK REEF
                              0.2, # LYNCHS REEF
                              -0.1, # CENTIPEDE REEF
                              -0.1, # GRUB REEF
                              -0.2), # HELIX REEF
                  # This should be 48 times, for our 48 sites
                  nudge_y = c(-0.2, # LITTLE KELSO REEF
                              -0.1, # KELSO REEF
                              0.1, # ROXBURGH REEF
                              0, # FORE&AFT REEF
                              0.1, # RIB REEF
                              -0.2, # JOHN BREWER REEF
                              0.1, # MYRMIDON REEF
                              -0.3, # CHICKEN REEF
                              0.1, # KNIFE REEF
                              0, # FORK REEF
                              -0.2, # LYNCHS REEF
                              -0.1, # CENTIPEDE REEF
                              -0.1, # GRUB REEF
                              -0.2), # HELIX REEF
  ) +
  coord_sf(xlim = c(146.8, 148), ylim = c(-18.1, -19)) +
  scale_color_manual(name = "Sampling trip", values=c("slateblue"))+
  theme_classic() +
  theme(panel.background = element_rect(fill = "lightblue3",
                                        colour = "lightblue3",
                                        size = 0.5,
                                        linetype = "solid")) +
  labs(x = "Longitude",
       y = "Latitude",
       title = "IMOS Microbial Genomics Database sites",
       subtitle = "Trip 4 (July 2020)") +
  #  scale_fill_manual(name = "Type of Water Body", values = cols_map) +
  theme(legend.direction = "vertical", legend.box = "vertical")
IMOS_MGD_trip4

Physico-chemical data - collection, pre-processing, and analysis

wrangling <- read.csv("~/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/all_seawater_0-05/metadata_wrangling.csv")

WQ_methods <- read.csv("~/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/Getting_the_environmental_data/Data_from_the_water_quality_team/FINAL_QCd_files/WQ_IDs_IMOS-MGD_only.csv")
# And this is one of the WQ spreadsheets - link between WQ IDs and our Reef names 

# Selecting only the columns of interest
WQ_methods <- dplyr::select(WQ_methods, one_of(c("REEF_NAME",
                                          "WQ_Station_Name",
                                          "Collection_method", # diving or from boat
                                          "Sample_collection_start")))#,
# We decided not to include the metrics bellow
                                          # "Swell_direction",
                                          # "Swell_height",
                                          # "Wind_direction",
                                          #"Wind_speed" )))

# Additional data from the LTMP trips
LTMP <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/all_seawater_0-05/metadata_IMOS_from_Mike.csv") %>% 
  dplyr::select("REEF_NAME", "Sampling_trip", "GBR_sector", "SAMPLE_DATE", "Lat", "Long")
### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
LTMP$Sampling_trip <- gsub("First", # String to search for
                               "Trip_01_Nov-Dec_2019", # Replace with this
                               as.character(LTMP$Sampling_trip)) # Column to search in

# Second trip
LTMP$Sampling_trip <- gsub("Second", # String to search for
                               "Trip_02_January_2020", # Replace with this
                               as.character(LTMP$Sampling_trip)) # Column to search in

# Third trip
LTMP$Sampling_trip <- gsub("Third", # String to search for
                               "Trip_03_February_2020", # Replace with this
                               as.character(LTMP$Sampling_trip)) # Column to search in

# Fourth trip
LTMP$Sampling_trip <- gsub("Fourth", # String to search for
                               "Trip_04_July_2020", # Replace with this
                               as.character(LTMP$Sampling_trip)) # Column to search in

# Joining - first step
metadata <- left_join(wrangling, LTMP)
# In this step I added the Sample IDs to the LTMP data
metadata <- left_join(metadata, WQ_methods)
# And in here the info from the WQ team

# importing the actual water chemistry measurements
WQ_Result_Report <- read.csv("~/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/Getting_the_environmental_data/Data_from_the_water_quality_team/FINAL_QCd_files/MBM_Result_Report_R_Dec_2022.csv")

# Removing Temperature and Salinity for now - too many missing values
WQ_Result_Report <- dplyr::select(WQ_Result_Report, one_of(c("WQ_Station_Name",
                                                      "DEPTH",
                                                      "Chlorophyll_a_.µg.L.",
                                                      "Phaeophytin_a_.µg.L.",
                                                      "PN_.µM.",
                                                      "POC_.µM.",
                                                      "PP_.µM.",
                                                      "DOC_.µM.",
                                                      "PO4_.µM.",
                                                      "NH4_.µM.",
                                                      "NO2_.µM.",
                                                      "NO3_.µM.",
                                                      "Si_.µM.",
                                                      "TDN_.µM.",
                                                      "TDP_.µM.",
                                                      "TSS_.mg.L.")))
                                                      # "Salinity",
                                                      # "Temperature.C..")))

# Now adding the Reef_name info
wq.all.reps.for.pca <- left_join(WQ_methods, WQ_Result_Report)

# I also need the info on Sampling trips - to color the groups on the PCA. I will also add the data from the research vessel at this stage - Temperature, Salinity, Turbidity, Fluorescence
reefs_trips <- read.csv("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/00_FINAL_PIPELINE/01_Preparation_of_Environmental_Metadata/metadata_with_reef_names.csv")
# But I only want reef names and their corresponding trips for now!
reefs_trips <- reefs_trips[,c(1,3)]

### 2 ### Renaming the sampling trips to include dates, and make sure they are ordered alphabetically
# First trip
reefs_trips$Sampling_trip <- gsub("First", # String to search for
                                              "Trip_01_Nov-Dec_2019", # Replace with this
                                              as.character(reefs_trips$Sampling_trip)) # Column to search in

# Second trip
reefs_trips$Sampling_trip <- gsub("Second", # String to search for
                                              "Trip_02_January_2020", # Replace with this
                                              as.character(reefs_trips$Sampling_trip)) # Column to search in

# Third trip
reefs_trips$Sampling_trip <- gsub("Third", # String to search for
                                              "Trip_03_February_2020", # Replace with this
                                              as.character(reefs_trips$Sampling_trip)) # Column to search in

# Fourth trip
reefs_trips$Sampling_trip <- gsub("Fourth", # String to search for
                                              "Trip_04_July_2020", # Replace with this
                                              as.character(reefs_trips$Sampling_trip)) # Column to search in

# Now adding the metadata from the RV
vessel_metadata <- read.csv("/home/markoterzin/Documents/PhD/FROM/from_Patrick/IMOS-MGD_additional_metadata/GBR-Genomics-Database_Seawater-Illumina-Reads.csv")
# Keeping only: Temperature, Salinity, Turbidity, Fluorescence
vessel_metadata <- dplyr::select(vessel_metadata, one_of(c("REEF_NAME",
                                                           "SEAWATER_TEMPERATURE_2.5m_RV",
                                                           "SALINITY_2.5m_RV",
#                                                           "TURBIDITY_2.5m_RV",
                                                           "FLUORESCENCE_2.5m_RV")))

# Joining now
reefs_trips <- left_join(reefs_trips,
                         vessel_metadata)
wq.all.reps.for.pca <- left_join(wq.all.reps.for.pca,
                                 reefs_trips)

Principal Components Analysis (PCA) - What are the main clustering patterns between our reefs based on physico-chemical data?

PCA was applied on a physico-chemical dataset containing 17 variables, including: 1. 14 water chemistry variables: ammonia (NH4), nitrite (NO2), nitrate (NO3), total dissolved nitrogen (TDN), phosphate (PO4), total dissolved phosphorus (TDP), dissolved organic carbon (DOC), silicate (Si), total suspended solids (TSS), chlorophyll a (Chl-a), phaeophytin a (Phaeo), particulate organic carbon (POC), particulate nitrogen (PN), and particulate phosphorus (PP). For each of these 14 water chemistry variables, triplicate 5 L seawater samples were collected using Niskin bottles for analysis of water chemistry variables, at each of the 48 reefs. 2. temperature, fluorescence, and salinity measurements from the underway sampling systems on the RV Solander and RV Cape Ferguson, with intake depths for underway systems were 1.9 m (RV Cape Ferguson) and 2.5 m (RV Solander). For these three measurements, one value per reef site was recorded.

Choosing the number of components

The mixOmics function tune.pca() calculates the cumulative proportion of explained variance for a large number of principal components (here we set ncomp = 10). A screeplot of the proportion of explained variance relative to the total amount of variance in the data for each principal component is output.

# In PCA, we first count the number of missing values, as this will tell us whether PCA will be solved using SVD (no missing values) or iterative NIPALS (with missing values) internally in the mixOmics function pca().
sum(is.na(wq.all.reps.for.pca[,c(6:14, 16:23)]))
## [1] 17
# Number of NAs
## [1] 17
# Since we have some missing values, the iterative NIPALS will be called inside pca()

tune.pca.WQ <- tune.pca(wq.all.reps.for.pca[,c(6:14, 16:23)], ncomp = 10, scale = TRUE)
plot(tune.pca.WQ)
Screeplot from the PCA performed on the IMOS GBR-MGD physico-chemical data: Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

Screeplot from the PCA performed on the IMOS GBR-MGD physico-chemical data: Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

# Numerical output
pca.wq.all.reps <- pca(wq.all.reps.for.pca[,c(6:14, 16:23)], # getting the numerical values only
                       ncomp = 10,
                       center = TRUE,
                       scale = TRUE)

# Explained variance per PCA component
knitr::kable(pca.wq.all.reps$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
The proportion of explained variance per each PCA component is:
x
PC1 0.4066099
PC2 0.1964915
PC3 0.0910662
PC4 0.0643839
PC5 0.0470814
PC6 0.0422856
PC7 0.0305387
PC8 0.0288760
PC9 0.0231195
PC10 0.0200885
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.wq.all.reps$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
The cumulative proportion of variance explained by each PCA component
x
PC1 0.4066099
PC2 0.6031014
PC3 0.6941676
PC4 0.7585515
PC5 0.8056329
PC6 0.8479185
PC7 0.8784572
PC8 0.9073332
PC9 0.9304527
PC10 0.9505412

Visualising patterns based on the final PCA model (with only the first two PCA dimensions)

PCA_WQ_sample_plot <- plotIndiv(pca.wq.all.reps,
          comp = c(1, 2), 
          group = wq.all.reps.for.pca$Sampling_trip, 
#          ind.names = wq.all.reps.for.pca$REEF_NAME,
          ellipse = T,
          col.per.group =c("indianred", # Sampling trip 1
                           "indianred4", # Sampling trip 2 
                           "red3", # Sampling trip 3
                           "slateblue"), # Sampling trip 4
          legend = TRUE,
          title = 'WQ Metadata all reps, PCA comp 1 - 2')

PCA_WQ_biplot <- biplot(pca.wq.all.reps,
       comp = c(1, 2), 
       group = wq.all.reps.for.pca$Sampling_trip,
#       ind.names = wq.all.reps.for.pca$REEF_NAME,
       col.per.group =c("indianred", # Sampling trip 1
                        "indianred4", # Sampling trip 2
                        "red3", # Sampling trip 3
                        "slateblue"), # Sampling trip 4
       legend = TRUE,
       legend.title = "Sampling trip",
       title = 'PCA biplot for WQ Metadata all reps, PCA comp 1 - 2')

The PCA results suggest that our water chemistry measurements from across the GBR were largely driven by seasonality, while geography had a weaker influence. Chemistry profiles of samples collected in early austral summer were comparable despite being >1500 km apart in the far north (Cape Grenville and Princess Charlotte bay sectors) and far south (Swains and Capricorn Bunker sectors) of the GBR, whereas samples collected during the peaks of austral summer and winter were the most distinct although they were geographically close in the central GBR (~200 km apart, Cairns and Cooktown / Lizard island sectors for austral summer samples, and Innisfail and Townsville sectors for austral winter samples). Further, we observe that summer trips 1-3 were characterised by elevated temperature and higher concentrations of dissolved and particulate nutrients, apart frpm TDP and phosphate which were elevated during winter.

However, we did not show reef names in either of the PCA plots as there is an overlap between data points (and hence the text is not readable), and also in these PCA visualisations, we lose context of raw values. This information was added with a heatmap (to compare physico-chemical metrics across sites) and with boxplots (which show the raw physico-chemical measurements).

Heatmap showing physico-chemical measurements across sites

We first collapsed the data to a mean/median value because for each of the 17 environmental metrics we computed the median value per reef site as the number of Niskin deployments differed for molecular (four replicates) and water chemistry (three replicates) sampling.

# Making the heatmap of LTMP data
WQ_heatmap <- metadata[,24:40] %>% # I am only choosing columns with median values
  scale(center = TRUE, scale = TRUE) %>% # I wand the values to be scaled
  as.data.frame() %>% # Converting back to data frame - ggplot needs this
  rownames_to_column("Sample_ID") %>% # Setting rownames as Col 1 - will need this for melting
  reshape2::melt() %>% # Getting the long format - this is what geom_tile needs
  left_join(metadata[, c(1,2)] # adding back the REEF NAME and SAMPLING TRIP vars
            %>% rownames_to_column("Sample_ID")
            ) %>% # Need to convert row names to Column 1 and give Sample_ID as name, because I am joining those with the same ID
  ggplot(aes(x = REEF_NAME, y = variable, fill = value)) +
  geom_tile() + # Plotting the heatmap here
  scale_fill_gradient2(low = "#075AFF",
                       mid = "#FFFFCC",
                       high = "#FF0000") + # The coloring scheme - red for high vals, blue for low
  facet_wrap(~Sampling_trip, scales = "free_x", ncol = 4) + # now facetting reef sites based on the Sampling trip
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
WQ_heatmap
The heatmap shows the level of change in all 17 physico-chemical variables (y axis) across the reef sites (x axis), grouped within their corresponding sampling trip. Environmental measurements were centered (median = 0) and scaled (standard deviation (SD) = 1) across reef sites, and values that deviate from the median (0) were shown in red (> median) and blue (< median). This heatmap was combined in Inkscape with the PCA visualisation for physico-chemical data to re-introduce the context of reef sites, which were not visualised in the PCA.

The heatmap shows the level of change in all 17 physico-chemical variables (y axis) across the reef sites (x axis), grouped within their corresponding sampling trip. Environmental measurements were centered (median = 0) and scaled (standard deviation (SD) = 1) across reef sites, and values that deviate from the median (0) were shown in red (> median) and blue (< median). This heatmap was combined in Inkscape with the PCA visualisation for physico-chemical data to re-introduce the context of reef sites, which were not visualised in the PCA.

Box plots showing raw physico-chemical measurements, as well as mean and median values

# Median is the default in ggplot2
reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
                                      6:19, # All numerical vals
                                      21, # Temperature
                                      22, # Salinity
                                      23, # Fluorescence
                                      20)]) %>% # Sampling trip
  ggplot(aes(y = value,
             x = Sampling_trip,
             fill = Sampling_trip),
         alpha=0.8) +
  geom_boxplot(#outlier.colour="red",
               outlier.shape=8,
               outlier.size=4) +
  geom_jitter(alpha = 0.6,
              size = 0.8) +
  stat_summary(fun=mean,
               geom="point",
               shape=20,
               size=0.8,
               color="seagreen1",
               fill="seagreen1") + # Plotting the mean as a green dot!
  facet_grid(rows = vars(variable),
             cols = vars(Sampling_trip),
             scales = "free"
             ) +
  scale_fill_manual(values = c("indianred", # Sampling trip 1
                           "indianred4", # Sampling trip 2 
                           "red3", # Sampling trip 3
                           "slateblue") # Sampling trip 4
                    ) +
  labs(y = "WQ metrics",
       x = "Reef sites",
       title = "Boxplots for WQ metrics (Median & Mean)"
       ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))
FIG CAP TO BE ADDED.

FIG CAP TO BE ADDED.

Getting the numerical summary of physico-chemical variables

# Data needs to be in long format
wq_median_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
                                      6:19, # All numerical vals
                                      21, # Temperature
                                      22, # Salinity
                                      23, # Fluorescence
                                      20)]) %>% # Sampling_trip
  as.data.frame() %>% 
  group_by(Sampling_trip, variable) %>% 
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
          median=round(median(value, na.rm=TRUE),
                       digits = 2)
          ) %>%
  reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_median_per_trip, caption = "Median for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
Median for all 17 physico-chemical metrics, collapsed across the four sampling trips.
variable Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Chlorophyll_a_.µg.L. 0.17 0.15 0.23 0.10
Phaeophytin_a_.µg.L. 0.17 0.17 0.36 0.10
PN_.µM. 1.23 1.19 1.30 0.50
POC_.µM. 7.12 7.77 9.74 3.52
PP_.µM. 0.04 0.04 0.06 0.02
DOC_.µM. 83.75 79.58 65.83 69.53
PO4_.µM. 0.05 0.04 0.02 0.09
NH4_.µM. 0.32 0.52 0.68 0.11
NO2_.µM. 0.02 0.04 0.03 0.01
NO3_.µM. 0.21 0.34 0.21 0.20
Si_.µM. 1.38 1.14 2.00 1.86
TDN_.µM. 5.43 6.53 5.70 5.28
TDP_.µM. 0.20 0.23 0.16 0.26
TSS_.mg.L. 0.43 0.13 0.08 0.05
SEAWATER_TEMPERATURE_2.5m_RV 27.78 27.16 30.16 24.40
SALINITY_2.5m_RV 35.27 35.55 34.72 35.16
FLUORESCENCE_2.5m_RV 0.10 0.11 0.32 0.09
# Data needs to be in long format
wq_mean_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
                                      6:19, # All numerical vals
                                      21, # Temperature
                                      22, # Salinity
                                      23, # Fluorescence
                                      20)]) %>% # Sampling_trip
  as.data.frame() %>% 
  group_by(Sampling_trip, variable) %>% 
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
          mean=round(mean(value, na.rm=TRUE),
                     digits = 2)
          ) %>%
  reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_mean_per_trip, caption = "Mean for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
Mean for all 17 physico-chemical metrics, collapsed across the four sampling trips.
variable Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Chlorophyll_a_.µg.L. 0.18 0.16 0.32 0.11
Phaeophytin_a_.µg.L. 0.18 0.20 0.36 0.10
PN_.µM. 1.23 1.27 1.32 0.50
POC_.µM. 8.06 7.60 9.95 3.66
PP_.µM. 0.05 0.05 0.07 0.02
DOC_.µM. 84.51 81.92 67.22 69.30
PO4_.µM. 0.05 0.04 0.02 0.10
NH4_.µM. 0.39 0.58 0.74 0.12
NO2_.µM. 0.03 0.04 0.04 0.01
NO3_.µM. 0.30 0.33 0.35 0.23
Si_.µM. 1.41 1.30 2.10 1.79
TDN_.µM. 5.47 6.62 5.64 5.18
TDP_.µM. 0.20 0.23 0.16 0.26
TSS_.mg.L. 0.48 0.15 0.35 0.12
SEAWATER_TEMPERATURE_2.5m_RV 27.78 27.13 30.01 24.22
SALINITY_2.5m_RV 35.35 35.52 34.71 35.16
FLUORESCENCE_2.5m_RV 0.10 0.10 0.34 0.13
# Data needs to be in long format
wq_sd_per_trip <- reshape2::melt(wq.all.reps.for.pca[,c(1, # Reef name
                                      6:19, # All numerical vals
                                      21, # Temperature
                                      22, # Salinity
                                      23, # Fluorescence
                                      20)]) %>% # Sampling_trip
  as.data.frame() %>% 
  group_by(Sampling_trip, variable) %>% 
# Now computing mean and SD
dplyr::summarize( # This tutorial for troubleshooting! https://stackoverflow.com/questions/46661461/calculate-mean-by-group-using-dplyr-package
          sd=round(sd(value, na.rm=TRUE),
                   digits = 2)
          ) %>%
  reshape2::dcast(variable~Sampling_trip)
# Showing as table
knitr::kable(wq_sd_per_trip, caption = "SD for all 17 physico-chemical metrics, collapsed across the four sampling trips.")
SD for all 17 physico-chemical metrics, collapsed across the four sampling trips.
variable Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Chlorophyll_a_.µg.L. 0.06 0.08 0.18 0.03
Phaeophytin_a_.µg.L. 0.04 0.08 0.15 0.02
PN_.µM. 0.35 0.46 0.22 0.10
POC_.µM. 2.86 1.89 2.29 1.00
PP_.µM. 0.02 0.02 0.03 0.01
DOC_.µM. 5.99 9.89 4.60 4.67
PO4_.µM. 0.03 0.02 0.02 0.02
NH4_.µM. 0.15 0.27 0.44 0.06
NO2_.µM. 0.02 0.01 0.02 0.00
NO3_.µM. 0.25 0.15 0.31 0.16
Si_.µM. 0.30 0.44 0.55 0.65
TDN_.µM. 0.83 0.82 0.72 0.75
TDP_.µM. 0.03 0.04 0.03 0.02
TSS_.mg.L. 0.41 0.15 0.52 0.10
SEAWATER_TEMPERATURE_2.5m_RV 0.43 0.61 0.39 0.95
SALINITY_2.5m_RV 0.21 0.17 0.05 0.04
FLUORESCENCE_2.5m_RV 0.01 0.02 0.05 0.12
# Exporting Medians as csv  
write.csv(wq_median_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_Median_per_trip.csv", quote = F, row.names = F)

# Exporting Means as csv
write.csv(wq_mean_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_mean_per_trip.csv", quote = F, row.names = F)

# Exporting SD as csv
write.csv(wq_sd_per_trip, file = "/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Supplementary_Tables/Table_WQ_SD_per_trip.csv", quote = F, row.names = F)

# The csv files on median and sd values were merged manually to make the Table 1 in the main text of the manuscript

Microbial metagenomic data - collection, pre-processing, and analysis

Raw counts were exported from MEGAN as biom files separately for (1) microbial taxonomy (genus level as the lowest category) and for (2) microbial functions (GO terms), and subsequently imported into R using the phyloseq R package. These biom files were combined with the metadata file to create 2 phyloseq objects (for taxa and genes), which have then undergone various filtering steps.

### Importing the biom tables, exported from MEGAN

### Taxonomy info | at 'Genus' level
megan_genus <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_full_dataset_Genera_191_samples_Neg_controls_July_2024.biom")
### Functional info | GO terms
megan_GO_5 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_GOs_Rank5_191_samples_Neg_controls_July_2024.biom")
# This one has 7476 GO terms
megan_GO_4 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_GOs_Rank4_191_samples_Neg_controls_July_2024.biom")
# This one has 5257 GO terms
megan_GO_3 <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_GOs_Rank3_191_samples_Neg_controls_July_2024.biom")
# This one has 705 GO terms

# Let's just modify the metadata file a bit to include neg controls as well
metadata_neg_controls <- left_join(megan_genus@otu_table %>% 
                                     t() %>% 
                             as.data.frame() %>% 
                             rownames_to_column("Sample_ID") %>% 
                               dplyr::select("Sample_ID"),
                           metadata %>% 
                             rownames_to_column("Sample_ID")) %>% 
  column_to_rownames("Sample_ID")

# Merging:
sample_data(megan_genus) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_5) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_4) <- sample_data(metadata_neg_controls)
sample_data(megan_GO_3) <- sample_data(metadata_neg_controls)

# Checking the phyloseq objects
megan_genus
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2066 taxa and 207 samples ]
## sample_data() Sample Data:       [ 207 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 2066 taxa by 7 taxonomic ranks ]
megan_GO_5
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7476 taxa and 207 samples ]
## sample_data() Sample Data:       [ 207 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 7476 taxa by 6 taxonomic ranks ]
megan_GO_4
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 5257 taxa and 207 samples ]
## sample_data() Sample Data:       [ 207 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 5257 taxa by 4 taxonomic ranks ]
megan_GO_3
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 706 taxa and 207 samples ]
## sample_data() Sample Data:       [ 207 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 706 taxa by 3 taxonomic ranks ]
# But I want to filter out the PF samples, and Broomfield rep 2 (because the sequencing was repeated for this one)
megan_genus <- subset_samples(megan_genus, sample_names(megan_genus)!='Lynchs-PF-1_S107_R1' &
                                sample_names(megan_genus)!='Lynchs-PF-2_S108_R1' &
                                sample_names(megan_genus)!='Lynchs-PF-3_S109_R1' &
                                sample_names(megan_genus)!='Lynchs-PF-4_S110_R1' &
                                sample_names(megan_genus)!='Myrmidon-PF-1_S111_R1' &
                                sample_names(megan_genus)!='Myrmidon-PF-2_S112_R1' &
                                sample_names(megan_genus)!='Myrmidon-PF-3_S113_R1' &
                                sample_names(megan_genus)!='Myrmidon-PF-4_S114_R1' &
                                sample_names(megan_genus)!='Rib-PF-1_S103_R1' &
                                sample_names(megan_genus)!='Rib-PF-2_S104_R1' &
                                sample_names(megan_genus)!='Rib-PF-3_S105_R1' &
                                sample_names(megan_genus)!='Rib-PF-4_S106_R1' &
                                sample_names(megan_genus)!='Broomfield-2_S50_R1')
megan_GO_5 <- subset_samples(megan_GO_5, sample_names(megan_GO_5)!='Lynchs-PF-1_S107_R1' &
                                sample_names(megan_GO_5)!='Lynchs-PF-2_S108_R1' &
                                sample_names(megan_GO_5)!='Lynchs-PF-3_S109_R1' &
                                sample_names(megan_GO_5)!='Lynchs-PF-4_S110_R1' &
                                sample_names(megan_GO_5)!='Myrmidon-PF-1_S111_R1' &
                                sample_names(megan_GO_5)!='Myrmidon-PF-2_S112_R1' &
                                sample_names(megan_GO_5)!='Myrmidon-PF-3_S113_R1' &
                                sample_names(megan_GO_5)!='Myrmidon-PF-4_S114_R1' &
                                sample_names(megan_GO_5)!='Rib-PF-1_S103_R1' &
                                sample_names(megan_GO_5)!='Rib-PF-2_S104_R1' &
                                sample_names(megan_GO_5)!='Rib-PF-3_S105_R1' &
                                sample_names(megan_GO_5)!='Rib-PF-4_S106_R1' &
                                sample_names(megan_GO_5)!='Broomfield-2_S50_R1')
megan_GO_4 <- subset_samples(megan_GO_4, sample_names(megan_GO_4)!='Lynchs-PF-1_S107_R1' &
                                sample_names(megan_GO_4)!='Lynchs-PF-2_S108_R1' &
                                sample_names(megan_GO_4)!='Lynchs-PF-3_S109_R1' &
                                sample_names(megan_GO_4)!='Lynchs-PF-4_S110_R1' &
                                sample_names(megan_GO_4)!='Myrmidon-PF-1_S111_R1' &
                                sample_names(megan_GO_4)!='Myrmidon-PF-2_S112_R1' &
                                sample_names(megan_GO_4)!='Myrmidon-PF-3_S113_R1' &
                                sample_names(megan_GO_4)!='Myrmidon-PF-4_S114_R1' &
                                sample_names(megan_GO_4)!='Rib-PF-1_S103_R1' &
                                sample_names(megan_GO_4)!='Rib-PF-2_S104_R1' &
                                sample_names(megan_GO_4)!='Rib-PF-3_S105_R1' &
                                sample_names(megan_GO_4)!='Rib-PF-4_S106_R1' &
                                sample_names(megan_GO_4)!='Broomfield-2_S50_R1')
megan_GO_3 <- subset_samples(megan_GO_3, sample_names(megan_GO_3)!='Lynchs-PF-1_S107_R1' &
                                sample_names(megan_GO_3)!='Lynchs-PF-2_S108_R1' &
                                sample_names(megan_GO_3)!='Lynchs-PF-3_S109_R1' &
                                sample_names(megan_GO_3)!='Lynchs-PF-4_S110_R1' &
                                sample_names(megan_GO_3)!='Myrmidon-PF-1_S111_R1' &
                                sample_names(megan_GO_3)!='Myrmidon-PF-2_S112_R1' &
                                sample_names(megan_GO_3)!='Myrmidon-PF-3_S113_R1' &
                                sample_names(megan_GO_3)!='Myrmidon-PF-4_S114_R1' &
                                sample_names(megan_GO_3)!='Rib-PF-1_S103_R1' &
                                sample_names(megan_GO_3)!='Rib-PF-2_S104_R1' &
                                sample_names(megan_GO_3)!='Rib-PF-3_S105_R1' &
                                sample_names(megan_GO_3)!='Rib-PF-4_S106_R1' &
                                sample_names(megan_GO_3)!='Broomfield-2_S50_R1')

# Checking the object again
megan_genus
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2066 taxa and 194 samples ]
## sample_data() Sample Data:       [ 194 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 2066 taxa by 7 taxonomic ranks ]
megan_GO_5
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7476 taxa and 194 samples ]
## sample_data() Sample Data:       [ 194 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 7476 taxa by 6 taxonomic ranks ]
megan_GO_4
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 5257 taxa and 194 samples ]
## sample_data() Sample Data:       [ 194 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 5257 taxa by 4 taxonomic ranks ]
megan_GO_3
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 706 taxa and 194 samples ]
## sample_data() Sample Data:       [ 194 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 706 taxa by 3 taxonomic ranks ]

Processing data

After removing the non pre-filtered samples, further data filtering included removal of reads (1) annotated as eukaryotic or viral; and (2) rare/spurious reads. Data was then Center-Log-Ratio (hereinafter ‘CLR’) transformed for statistical analysis in the mixOmics R package.

Removal of Eukaryotic contamination

We annotated a total of 1919 microbial taxa (lowest category: genus level). Reads that were annotated as Eukarya (729 taxa in total) and viruses (11 viral annotations) were excluded from the analysis. Further analysis was performed on a phyloseq object with prokaryotic annotations only, a total of 1179 bacterial and archaeal groups (Figure 2, Table 1).

# Before plotting the bar plots, I first need to prepare my objects
### Taxonomy info | at 'Genus' level
megan_genus_all <- megan_genus
megan_genus_TAX_all <- as.data.frame(megan_genus_all@tax_table)

# Plot admixture barplot - Domain level (and viruses)
cols_domain <- c("slategray3", # Archaea
                 "grey45", # Bacteria
                 "salmon", # Eukaryota
                 "violetred", # f__Mimiviridae
                 "steelblue3", # f__Phycodnaviridae
                 "lightsteelblue4", # f__Retroviridae
                 "seashell4") # o__Caudovirales

DOMAIN <- as.data.frame(megan_genus_all@otu_table) %>% 
  rownames_to_column("OTUs") %>% # I will need this later to add taxonomy info
  left_join(megan_genus_TAX_all %>% rownames_to_column("OTUs")) %>% # adding taxonomy info
  column_to_rownames("OTUs") %>% 
  group_by(Rank1) %>% 
  # Keeping only numerical values now
  summarise_if(.predicate = function(x) is.numeric(x),
               .funs = funs(sum)) # Computing sums
# Now relative abundances
DOMAIN_RA <- DOMAIN
for (i in 2:(ncol(DOMAIN_RA))) {
  DOMAIN_RA[i] <- DOMAIN_RA[i] / sum(DOMAIN_RA[i]) 
}

barplots_domain <- DOMAIN_RA %>%
  column_to_rownames("Rank1") %>% 
  t() %>%
  as.data.frame() %>% 
  rownames_to_column("Sample_ID") %>% 
  reshape2::melt() %>% 
  left_join(metadata %>% rownames_to_column("Sample_ID")) %>% 
# Plotting now!
  ggplot(aes(x=Sample_ID, y=value, fill=variable))+
  geom_bar(stat = "identity")+
  scale_y_continuous(expand = c(0,0))+
  facet_wrap(~Sampling_trip, scales = "free", nrow = 5)+
#  facet_grid(~Sampling_trip, scales = "free_x", space = "free")+
  scale_fill_manual(values = cols_domain)+
  ylab("Relative abundance of taxa (at Domain level)")+
  xlab("Reef sites")+
  theme(axis.text.x = # element_blank(),
        element_text(angle = 75, hjust = 1, size = 12),
        #axis.ticks.x = element_blank(),
        #axis.title.x = element_blank(),
        strip.text = element_text(colour="black", size=12),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.position = "right",
        legend.title = element_blank(),
        legend.text = element_text(size = 12))
barplots_domain

We see that we have eukaryotic reads, let’s see how many taxa?

megan_genus_bacteria <- subset_taxa(megan_genus, # Phyloseq object with all OTUs
                           Rank1=="d__Bacteria") # The phyloseq object with raw counts
megan_genus_archaea <- subset_taxa(megan_genus, # Phyloseq object with all OTUs
                                    Rank1=="d__Archaea") # The phyloseq object with raw counts

megan_genus_PROKS <- merge_phyloseq(megan_genus_bacteria,
                                    megan_genus_archaea) # Phyloseq object with Proks only
megan_genus_EUKS <- subset_taxa(megan_genus_all,
                          Rank1=="d__Eukaryota") # Phyloseq object with Euks only
knitr::kable(as.data.frame(cbind(as.character(ntaxa(megan_genus_EUKS)), as.character(ntaxa(megan_genus_bacteria)), as.character(ntaxa(megan_genus_archaea)), as.character(ntaxa(megan_genus_PROKS)))), caption = "Taxonomic breakdown", col.names = c("Eukaryota", "Bacteria", "Archaea", "Prokarya"))
Taxonomic breakdown
Eukaryota Bacteria Archaea Prokarya
774 1212 45 1257

If we compare with abundances of prokaryotes (which are the target for this study), are there any euks that are highly abundant?

megan_genus_all_with_euks <- import_biom("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Data_analysis/Seawater/testing_Igor_Antti_R_script/All_seawater_full_dataset/input_files/IMOS_MGD_Megan_biom_files_191_samples_and_Neg_controls_July_2024/IMOS-MGD_Seawater_full_dataset_Genera_191_samples_Neg_controls_July_2024.biom")
# Merging!
sample_data(megan_genus_all_with_euks) <- sample_data(metadata_neg_controls)
# Removing the PF samples and Broomfield 2 (seq failed for this rep, and we have the repeated sample for rep 2)
megan_genus_all_with_euks <- subset_samples(megan_genus_all_with_euks, sample_names(megan_genus_all_with_euks)!='Lynchs-PF-1_S107_R1' &
                                sample_names(megan_genus_all_with_euks)!='Lynchs-PF-2_S108_R1' &
                                sample_names(megan_genus_all_with_euks)!='Lynchs-PF-3_S109_R1' &
                                sample_names(megan_genus_all_with_euks)!='Lynchs-PF-4_S110_R1' &
                                sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-1_S111_R1' &
                                sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-2_S112_R1' &
                                sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-3_S113_R1' &
                                sample_names(megan_genus_all_with_euks)!='Myrmidon-PF-4_S114_R1' &
                                sample_names(megan_genus_all_with_euks)!='Rib-PF-1_S103_R1' &
                                sample_names(megan_genus_all_with_euks)!='Rib-PF-2_S104_R1' &
                                sample_names(megan_genus_all_with_euks)!='Rib-PF-3_S105_R1' &
                                sample_names(megan_genus_all_with_euks)!='Rib-PF-4_S106_R1' &
                                sample_names(megan_genus_all_with_euks)!='Broomfield-2_S50_R1')

# Removing the non-annotated stuff!
megan_genus_all_anno_only <- subset_taxa(megan_genus_all_with_euks, Rank2!="NA")
# Getting relative abundances too
megan_genus_all_RA = transform_sample_counts(megan_genus_all_with_euks, function(x) x / sum(x) )

# Selecting the top 100 most abundant MAGs (based on RA data)
megan_genus_top200_RA_abund_with_euks <- taxa_sums(megan_genus_all_RA) %>%
  sort(decreasing = TRUE) %>%
  head(200) %>% # Taking the first X most abundant taxa.
  # Change the number depending on how many Genera I want to look at
  names()

# Making a new phyloseq object
megan_genus_top200_RA_with_euks <- prune_taxa(megan_genus_top200_RA_abund_with_euks, # These are the top 20
                                  megan_genus_all_RA)

# Defining breaks - to make sure even very lowly abundant taxa will be visible!
# From Steve:
breaks=c(0,0.001,0.01,0.05,0.1,0.25,0.4,0.5,0.6,0.7,1)

# But I want to have less breaks
# breaks_5=c(0,0.001,0.1,0.3,0.7,1)
# Plot heatmap
left_join(otu_table(megan_genus_top200_RA_with_euks) %>% as.data.frame %>% rownames_to_column("OTU"),
          tax_table(megan_genus_top200_RA_with_euks) %>% as.data.frame %>% rownames_to_column("OTU")) %>%
  arrange(match(OTU, megan_genus_top200_RA_abund_with_euks)) %>% # Arranging by abundances here
  unite(taxonomy, c(OTU, Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") %>% # Adding Taxonomy info
  gather(Sample_ID, Reads, -taxonomy) %>% # 'Reads' contains the Raw counts
#  left_join(as.data.frame(sample_data(megan_genus_all_RA)) %>% rownames_to_column("Sample_ID")) %>% # Now joining with the metadata
  left_join(metadata %>% rownames_to_column("Sample_ID")) %>% 
  # Ready to plot now!
  ggplot(aes(x = Sample_ID, # Short reef names on the x axis
             y = reorder(taxonomy, # Taxonomy info on the y axis
                         Reads), # With Taxa ordered based on abundances, most abundant listed first
             fill = Reads)) + # Change to 'Reads' if plotting the raw counts
  geom_tile() + # This colors the heatmap in blue & makes the more abundant taxa darker in color
  facet_grid(cols = vars(Sampling_trip), scales = "free_x", space = "free") + # Splitting in facets
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) + # Rotating the text at 90 degrees angle
  # Use this if I want a smaller number of breaks
#  scale_fill_stepsn(breaks = breaks_5, colours =c("white", # for the 0-0.001 RA range!
#                                                "slategray1", "slategray2", "slategray3", "slategray4")) +
  # Or from Steve:
  scale_fill_stepsn(breaks = breaks, colours =c("white", # for the 0-0.001 RA range!
                                                  "slategray4", # 001 - 0.01
                                                  "slategray3", # 0.01 - 0.05
                                                  "slategray2", # 0.05 - 0.1
                                                  "navajowhite", # 0.1 - 0.25
                                                  "rosybrown2", # 0.25,0.4
                                                  "lightsalmon", # 0.4 - 0.5
                                                  "rosybrown1", # 0.5 - 0.6
#                                                  "lightgoldenrod1", # 0.6 - 0.7
                                                  "indianred2")) + # 0.7,1
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0))

Removal of Rare/Spurious Reads - prokaryotes only

Prior to removal of rare and spurious reads, non-annotated reads were removed from the dataset. We then computed relative abundance values (RA) and removed reads with average RA < 0.0001% across samples. After removing OTUs that were less than 0.0001% abundant, we retained 618 taxa (primarily at Genus level) out of the initial 1179 prokaryotic OTUs. At functional level, we retained 5015 GO annotations (out of 8689 GO terms).

### IMPORTANT - ***Change this part of the script*** depending on which phyloseq object I would like to look at: prokaryotic, eukaryotic or all. This way I wouldn't need to modify multiple lines in the script below
megan_genus <- megan_genus_PROKS # options to choose from: megan_genus_PROKS, megan_genus_EUKS

# Cleaning the names here already! This way I will make sure every other phyloseq object will have organised taxonomy
megan_genus_TAX_PROKS <- as.data.frame(megan_genus@tax_table)

# Unite the names within one column called "Taxonomy"
megan_genus_TAX_PROKS <- megan_genus_TAX_PROKS %>% 
  unite(Taxonomy, c(Rank1, Rank2, Rank3, Rank4, Rank5, Rank6, Rank7), sep = "; ") # Adding Taxonomy info

# Initialize empty columns
megan_genus_TAX_PROKS$Domain <- NA
megan_genus_TAX_PROKS$Phylum <- NA
megan_genus_TAX_PROKS$Class <- NA
megan_genus_TAX_PROKS$Order <- NA
megan_genus_TAX_PROKS$Family <- NA
megan_genus_TAX_PROKS$Genus <- NA
megan_genus_TAX_PROKS$Species <- NA

# Categorise taxonomic strings based on patterns:
megan_genus_TAX_PROKS$Domain <- str_match(megan_genus_TAX_PROKS$Taxonomy, "^d__(.+?);")[, 2]
megan_genus_TAX_PROKS$Phylum <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; p__(.+?);")[, 2]
megan_genus_TAX_PROKS$Class <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; c__(.+?);")[, 2]
megan_genus_TAX_PROKS$Order <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; o__(.+?);")[, 2]
megan_genus_TAX_PROKS$Family <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; f__(.+?);")[, 2]
megan_genus_TAX_PROKS$Genus <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; g__(.+?);")[, 2]
megan_genus_TAX_PROKS$Species <- str_match(megan_genus_TAX_PROKS$Taxonomy, "; s__(.+?);")[, 2]

# Last thing: replacing missing values with "Unknown_*"
megan_genus_TAX_PROKS$Domain[is.na(megan_genus_TAX_PROKS$Domain)] <- "Unknown Domain"
megan_genus_TAX_PROKS$Phylum[is.na(megan_genus_TAX_PROKS$Phylum)] <- "Unknown Phylum"
megan_genus_TAX_PROKS$Class[is.na(megan_genus_TAX_PROKS$Class)] <- "Unknown Class"
megan_genus_TAX_PROKS$Order[is.na(megan_genus_TAX_PROKS$Order)] <- "Unknown Order"
megan_genus_TAX_PROKS$Family[is.na(megan_genus_TAX_PROKS$Family)] <- "Unknown Family"
megan_genus_TAX_PROKS$Genus[is.na(megan_genus_TAX_PROKS$Genus)] <- "Unknown Genus"
megan_genus_TAX_PROKS$Species[is.na(megan_genus_TAX_PROKS$Species)] <- "Unknown Species"

# Remove the original taxonomy column
megan_genus_TAX_PROKS <- megan_genus_TAX_PROKS %>% 
  dplyr::select(Domain, Phylum, Class, Order, Family, Genus, Species)
# All cleaned up! :) thanks ChatGPT

### Putting this back into the phyloseq object:
# First checking the current taxonomic names in phyloseq object
current_taxa_names <- taxa_names(megan_genus)
# Compare with tax_table column names and order
polished_tax_table <- colnames(t(megan_genus_TAX_PROKS))
# Check if they match
if (!identical(current_taxa_names, polished_tax_table)) {
  stop("Polished taxonomic names in megan_genus_TAX_PROKS do not match the taxa_names in the megan_genus phyloseq object.")
}
# Looks like they match! So I'm not sure why I cannot merge them (code below)

# Check dimensions of tax_table and physeq - does the number of rows match?
nrow_tax_table <- nrow(megan_genus_TAX_PROKS)
ntaxa_physeq <-ntaxa(megan_genus)  # Number of taxa in physeq

if (nrow_tax_table != ntaxa_physeq) {
  stop("Number of rows in megan_genus_TAX_PROKS does not match the number of taxa in megan_genus phyloseq object.")
}

# Step 3: Compare order of unique values
identical(megan_genus_TAX_PROKS %>% # Looking for OTU order for the polishes taxa
            rownames_to_column("OTUs") %>%
            dplyr::select("OTUs"),
          megan_genus@tax_table %>%  # Looking for OTU order in the current phyloseqq object
            as.data.frame() %>% 
            rownames_to_column("OTUs") %>% 
            dplyr::select("OTUs"))
## [1] TRUE
# Again, this is also the same

# Here too:
identical(row.names(megan_genus_TAX_PROKS),
          row.names(otu_table(megan_genus))
          )
## [1] TRUE
# Yes, the row names are identical

# Now adding this polished taxonomy to my phyloseq object:
tax_table(megan_genus) <- as.matrix(megan_genus_TAX_PROKS)

### Removing the negative controls too - taxa phyloseq object:
megan_genus_no_neg_control <- subset_samples(megan_genus,
                                             sample_names(megan_genus)!='Neg-control-1_S101_R1' &
                                               sample_names(megan_genus)!='Neg-control-2_S24_R1' &
                                               sample_names(megan_genus)!='Neg-control-3_S116_R1')

### Removing the negative controls too - GOs at rank 5 phyloseq object:
megan_GO_5_no_neg_control <- subset_samples(megan_GO_5,
                                             sample_names(megan_GO_5)!='Neg-control-1_S101_R1' &
                                               sample_names(megan_GO_5)!='Neg-control-2_S24_R1' &
                                               sample_names(megan_GO_5)!='Neg-control-3_S116_R1')

### Removing the negative controls too - GOs at rank 4 phyloseq object:
megan_GO_4_no_neg_control <- subset_samples(megan_GO_4,
                                             sample_names(megan_GO_4)!='Neg-control-1_S101_R1' &
                                               sample_names(megan_GO_4)!='Neg-control-2_S24_R1' &
                                               sample_names(megan_GO_4)!='Neg-control-3_S116_R1')

### Removing the negative controls too - GOs at rank 3 phyloseq object:
megan_GO_3_no_neg_control <- subset_samples(megan_GO_3,
                                             sample_names(megan_GO_3)!='Neg-control-1_S101_R1' &
                                               sample_names(megan_GO_3)!='Neg-control-2_S24_R1' &
                                               sample_names(megan_GO_3)!='Neg-control-3_S116_R1')
### Instead of setting an arbitrary threshold (e.g 100 seqs), I would like to filter based on relative abundances (***removing all OTUs < 0.0001% rel. abundance***)

# Tutorial I used: https://joey711.github.io/phyloseq/preprocess.html

### Removing reads that are annotated at Bacteria or Archaea levels only - not informative!
megan_genus_anno_only <- subset_taxa(megan_genus_no_neg_control, Domain!="NA")

### Removing reads that were not annotated at Rank2 level - not informative!
megan_GO_5_anno_only <- subset_taxa(megan_GO_5_no_neg_control, Rank2!="NA")
megan_GO_4_anno_only <- subset_taxa(megan_GO_4_no_neg_control, Rank2!="NA")
megan_GO_3_anno_only <- subset_taxa(megan_GO_3_no_neg_control, Rank2!="NA")

# Getting the taxa data frame
megan_genus_TAX <- as.data.frame(megan_genus_anno_only@tax_table)
# Getting the taxa data frame
megan_GO_5_FUN <- as.data.frame(megan_GO_5_anno_only@tax_table)
megan_GO_4_FUN <- as.data.frame(megan_GO_4_anno_only@tax_table)
megan_GO_3_FUN <- as.data.frame(megan_GO_3_anno_only@tax_table)

### Getting the relative abundances
# Taxa
megan_genus_RA = transform_sample_counts(megan_genus_anno_only, function(x) x / sum(x) )
# GO terms
megan_GO_5_RA  = transform_sample_counts(megan_GO_5_anno_only, function(x) x / sum(x) )
megan_GO_4_RA  = transform_sample_counts(megan_GO_4_anno_only, function(x) x / sum(x) )
megan_GO_3_RA  = transform_sample_counts(megan_GO_3_anno_only, function(x) x / sum(x) )
# removing all OTUs that are less than 0.0001% abundant
megan_genus_RA_no_rare = filter_taxa(megan_genus_RA, function(x) mean(x) > 1e-6, TRUE)
# removing all genes that are less than 0.0001% abundant
megan_GO_5_RA_no_rare = filter_taxa(megan_GO_5_RA, function(x) mean(x) > 1e-6, TRUE)
megan_GO_3_RA_no_rare = filter_taxa(megan_GO_3_RA, function(x) mean(x) > 1e-6, TRUE)
megan_GO_4_RA_no_rare = filter_taxa(megan_GO_4_RA, function(x) mean(x) > 1e-6, TRUE)
Before_after_filtering <- cbind(rbind(ntaxa(megan_genus), ntaxa(megan_genus_RA_no_rare)),
                                rbind(ntaxa(megan_GO_5), ntaxa(megan_GO_5_RA_no_rare))
#                                rbind(ntaxa(megan_COGs), ntaxa(megan_COGs_RA_no_rare))
                                ) %>% 
  as.data.frame()
# Adding row names now
row.names(Before_after_filtering) <- c("Before filtering", "After filtering")

knitr::kable(Before_after_filtering, caption = "Removal of Rare/Spurious reads (< 0.0001% RA)", col.names = c("Taxa", "GO terms"), row.names = T)
Removal of Rare/Spurious reads (< 0.0001% RA)
Taxa GO terms
Before filtering 1257 7476
After filtering 561 4287
megan_genus_abundant <- prune_taxa(taxa_names(megan_genus_RA_no_rare), # List of OTUs after filtering
                                megan_genus_no_neg_control) # My phyloseq object with raw counts
megan_GO_5_abundant <- prune_taxa(taxa_names(megan_GO_5_RA_no_rare), # List of OTUs after filtering
                                megan_GO_5_no_neg_control) # My phyloseq object with raw counts
megan_GO_4_abundant <- prune_taxa(taxa_names(megan_GO_4_RA_no_rare), # List of OTUs after filtering
                                megan_GO_4_no_neg_control) # My phyloseq object with raw counts
megan_GO_3_abundant <- prune_taxa(taxa_names(megan_GO_3_RA_no_rare), # List of OTUs after filtering
                                megan_GO_3_no_neg_control) # My phyloseq object with raw counts
# CLR is the normalisation method suggested by the mixOmics R package for microbial data - a way to address missing values that are characteristic of microbial datasets. I need to remove missing values before doing the CLR normalisation - The geometric mean cannot be determined for sparse data without deleting, replacing or estimating the 0 count values. So I am introducing pseudo counts

### Tutorial used: http://mixomics.org/mixmc/mixmc-preprocessing/

# Checking if there are any zeros - BEFORE adding pseudocounts
sum(which(megan_genus_abundant@otu_table == 0))
## [1] 3698620822
sum(which(megan_GO_5_abundant@otu_table == 0))
## [1] 43240551782
# sum(which(megan_COGs_abundant@otu_table == 0))
# Pseudocounts - replacing all zero vals with 1; 
megan_genus_abundant@otu_table <- megan_genus_abundant@otu_table + 1
megan_GO_5_abundant@otu_table <- megan_GO_5_abundant@otu_table +1
megan_GO_3_abundant@otu_table <- megan_GO_3_abundant@otu_table +1
megan_GO_4_abundant@otu_table <- megan_GO_4_abundant@otu_table +1
# megan_COGs_abundant@otu_table <- megan_COGs_abundant@otu_table + 1
# Checking if there are any zeros - AFTER adding pseudocounts
sum(which(megan_genus_abundant@otu_table == 0))
## [1] 0
sum(which(megan_GO_5_abundant@otu_table == 0))
## [1] 0
# sum(which(megan_COGs_abundant@otu_table == 0))
# All good! No NAs after introducing pseudocounts

### Now I can CLR transform when running analyses in mixOmics!
# I am using an option from the microbiome R package, not the same as in MixOmics.
megan_genus_clr <- microbiome::transform(megan_genus_abundant, "clr")
megan_go_clr_5 <- microbiome::transform(megan_GO_5_abundant, "clr")
megan_go_clr_3 <- microbiome::transform(megan_GO_3_abundant, "clr")
megan_go_clr_4 <- microbiome::transform(megan_GO_4_abundant, "clr")
# megan_COGs_clr <- microbiome::transform(megan_COGs_abundant, "clr")

# megan_go_clr_3_bp <- megan_go_clr_3 %>% 
#  subset_taxa(Rank2 == 'GO:0008150 biological_process')
# megan_go_clr_4_bp <- megan_go_clr_4 %>% 
#  subset_taxa(Rank2 == 'GO:0008150 biological_process')
# But for GO at lvl 3, I only want bio process
# save.image("/home/markoterzin/Documents/PhD/Thesis/Chapter_2/Paper_drafts/the_analysis_is_finalised/Code/Code_for_Terzin_et_al_Microbial_Function_Outperforms_Taxonomy_in_Inferring_Water_Chemistry_across_the_Great_Barrier_Reef.RData")

Principal Components Analysis (PCA) - What are the main clustering patterns between our reefs based on physico-chemical data?

# Preparing the object to have taxa names on boxplots
OTUs_biplot <- as.data.frame(megan_genus_clr@otu_table) %>% 
  t() # mixOmics needs samples and microbes to be reordered, so transposing here
# Check dimensions of data
dim(OTUs_biplot)
## [1] 191 561
class(OTUs_biplot)
## [1] "matrix" "array"
# Getting taxa names
OTUs_biplot_colnames_for_biplot <- left_join(otu_table(megan_genus_clr) %>%
                                    as.data.frame %>%
                                    rownames_to_column("OTU"),
                                  tax_table(megan_genus_clr) %>%
                                    as.data.frame %>%
                                    rownames_to_column("OTU")) %>%
  unite(taxonomy, c(Family, Genus), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
OTUs_biplot_colnames_for_biplot <- OTUs_biplot_colnames_for_biplot %>% 
  dplyr::select("OTU", "taxonomy")
# Merging with the OTUs_biplot object
OTUs_biplot_names <- left_join(t(OTUs_biplot) %>% 
                           as.data.frame() %>% 
                           rownames_to_column("OTU"),
                         OTUs_biplot_colnames_for_biplot) %>% 
  unite(Annotations, c(OTU, taxonomy), sep = "_") %>% 
  column_to_rownames("Annotations") %>%  # moving this as rowposing back into the right format
  t() # trans

# PCA
result.pca.taxa.names <- pca(OTUs_biplot_names)

# Plotting the PCA sample plot
plotIndiv(result.pca.taxa.names,
          group = sample_data(megan_genus_abundant)$Sampling_trip,
          title = 'PCA | Microbial Taxonomy',
          legend = T,
          ellipse = TRUE,
          ind.names = F,
          col.per.group =c("indianred", # Sampling trip 1
                "indianred4", # Sampling trip 2 
                "red3", # Sampling trip 3
                "slateblue"), # Sampling trip 4
          legend.title = 'Sampling trip'
          )

# Plotting the PCA biplot
biplot(result.pca.taxa.names,
       comp = c(1, 2),
       group = sample_data(megan_genus_abundant)$Sampling_trip,
       ind.names = F,
       ellipse = T,
       col.per.group =c("indianred", # Sampling trip 1
                        "indianred4", # Sampling trip 2
                        "red3", # Sampling trip 3
                        "slateblue"), # Sampling trip 4
       legend = TRUE,
       vline = T,
       hline = T,
       cutoff = 0.65,
       legend.title = "Sampling trip")

# Parameter tuning
tune.pca.taxa <- tune.pca(OTUs_biplot_names, ncomp = 10, scale = TRUE)
plot(tune.pca.taxa)
Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial taxonomy): Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial taxonomy): Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

# Numerical output
pca.taxa.num <- pca(OTUs_biplot_names, # getting the numerical values only
                       ncomp = 10,
                       center = TRUE,
                       scale = TRUE)

# Explained variance per PCA component
knitr::kable(pca.taxa.num$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
The proportion of explained variance per each PCA component is:
x
PC1 0.2030513
PC2 0.1235319
PC3 0.0686204
PC4 0.0537983
PC5 0.0456617
PC6 0.0431242
PC7 0.0303674
PC8 0.0226126
PC9 0.0216532
PC10 0.0178213
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.taxa.num$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
The cumulative proportion of variance explained by each PCA component
x
PC1 0.2030513
PC2 0.3265833
PC3 0.3952037
PC4 0.4490020
PC5 0.4946637
PC6 0.5377879
PC7 0.5681553
PC8 0.5907679
PC9 0.6124211
PC10 0.6302423
# Preparing the object to have taxa names on boxplots
GOs_biplot <- as.data.frame(megan_go_clr_5@otu_table) %>% 
  t() # mixOmics needs samples and microbes to be reordered, so transposing here
# Check dimensions of data
dim(GOs_biplot)
## [1]  191 4287
class(GOs_biplot)
## [1] "matrix" "array"
# Getting gene names
GOs_biplot_colnames_for_biplot <- left_join(otu_table(megan_go_clr_5) %>%
                                    as.data.frame %>%
                                    rownames_to_column("OTU"),
                                  tax_table(megan_go_clr_5) %>%
                                    as.data.frame %>%
                                    rownames_to_column("OTU")) %>%
  unite(Gene_annotations, c(Rank4), sep = "; ") # Adding Taxonomy info
## Joining, by = "OTU"
GOs_biplot_colnames_for_biplot <- GOs_biplot_colnames_for_biplot %>% 
  dplyr::select("OTU", "Gene_annotations")
# Merging with the OTUs_biplot object
GOs_biplot_names <- left_join(t(GOs_biplot) %>% 
                           as.data.frame() %>% 
                           rownames_to_column("OTU"),
                         GOs_biplot_colnames_for_biplot) %>% 
  unite(Annotations, c(OTU, Gene_annotations), sep = "_") %>% 
  column_to_rownames("Annotations") %>%  # moving this as rowposing back into the right format
  t() # trans

# PCA
result.pca.GOs.names <- pca(GOs_biplot_names)

# Plotting the PCA sample plot
plotIndiv(result.pca.GOs.names,
          group = sample_data(megan_GO_5_abundant)$Sampling_trip,
          title = 'PCA | Microbial Functions',
          legend = T,
          ellipse = TRUE,
          ind.names = F,
          col.per.group =c("indianred", # Sampling trip 1
                "indianred4", # Sampling trip 2 
                "red3", # Sampling trip 3
                "slateblue"), # Sampling trip 4
          legend.title = 'Sampling trip'
          )

# Plotting the PCA biplot
biplot(result.pca.GOs.names,
       comp = c(1, 2),
       group = sample_data(megan_GO_5_abundant)$Sampling_trip,
       ind.names = F,
       ellipse = T,
       col.per.group =c("indianred", # Sampling trip 1
                        "indianred4", # Sampling trip 2
                        "red3", # Sampling trip 3
                        "slateblue"), # Sampling trip 4
       legend = TRUE,
       vline = T,
       hline = T,
       cutoff = 0.95,
       legend.title = "Sampling trip")

# Parameter tuning
tune.pca.GOs <- tune.pca(GOs_biplot_names, ncomp = 10, scale = TRUE)
plot(tune.pca.GOs)
Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial genes - GO terms): Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

Screeplot from the PCA performed on the IMOS GBR-MGD metagenomics data (microbial genes - GO terms): Amount of explained variance for each principal component is shown. From the numerical output (shown bellow in tabular format), we observe that the first two principal components explain 60.31% of the total variance. The rule of thumb for choosing the number of PCA components is not so much to set a hard threshold based on the cumulative proportion of explained variance (as this is data-dependent), but to observe when a drop, or elbow, appears on the screeplot. The elbow indicates that the remaining variance is spread over many principal components and is not relevant in obtaining a low-dimensional ‘snapshot’ of the data. Based on this, we chose to keep two PCA dimensions.

# Numerical output
pca.GOs.num <- pca(GOs_biplot_names, # getting the numerical values only
                       ncomp = 10,
                       center = TRUE,
                       scale = TRUE)

# Explained variance per PCA component
knitr::kable(pca.GOs.num$prop_expl_var$X, caption = "The proportion of explained variance per each PCA component is:")
The proportion of explained variance per each PCA component is:
x
PC1 0.3436569
PC2 0.1184468
PC3 0.0592101
PC4 0.0477584
PC5 0.0386741
PC6 0.0292849
PC7 0.0203148
PC8 0.0186514
PC9 0.0143016
PC10 0.0118723
# The cumulative proportion of variance explained by each PCA component
knitr::kable(pca.GOs.num$cum.var, caption = "The cumulative proportion of variance explained by each PCA component")
The cumulative proportion of variance explained by each PCA component
x
PC1 0.3436569
PC2 0.4621037
PC3 0.5213138
PC4 0.5690722
PC5 0.6077463
PC6 0.6370312
PC7 0.6573461
PC8 0.6759974
PC9 0.6902990
PC10 0.7021713

Parameter tuning in mixOmics to identify the optimal number of principal components (PCs) showed that the variance explained by adding more than 2 PCs is insignificant for both taxonomy and function. Hence, 2 PCs were retained. PCA clustering identified a clear difference between summer and winter samples, for both taxonomy and function. However, this clustering becomes more evident at functional compared to taxonomic levels. The percentage of variance explained by the first 2 Principal components (PCs) equaled to ~26% for taxonomy, and 55% for functions (GO terms). A PERMANOVA test was then carried out to investigate which comparisons are statistically significant.

PERMANOVA

Analysis of similarities (ANOSIM) testing whether there is a statistically significant difference between two or more groups of sampling units - sampling trips. We will then perform a Pairwise PERMANOVA.

taxa.anosim <- left_join(otu_table(megan_genus_RA_no_rare) %>%
                           as.data.frame %>%
                           rownames_to_column("OTU"),
                         megan_genus_TAX %>%
                           rownames_to_column("OTU")) %>% 
  unite(taxonomy, c(OTU, Domain, Phylum, Class, Order, Family, Genus, Species), sep = "; ") %>% 
  column_to_rownames("taxonomy")
# Removing rows with NAs, because ANOSIM does not take in missing vals
taxa.anosim <- na.omit(taxa.anosim)

# Object is ready to perform the test
ano_taxa <- anosim(t(taxa.anosim), 
                   sample_data(megan_genus_RA_no_rare)$Sampling_trip, 
                   distance = "bray", 
                   permutations = 9999)
# Results
ano_taxa
## 
## Call:
## anosim(x = t(taxa.anosim), grouping = sample_data(megan_genus_RA_no_rare)$Sampling_trip,      permutations = 9999, distance = "bray") 
## Dissimilarity: bray 
## 
## ANOSIM statistic R: 0.2459 
##       Significance: 1e-04 
## 
## Permutation: free
## Number of permutations: 9999

Pairwise PERMANOVA - taxa

Visualisation

Phylum level

What are the most abundant phyla?

Out of the 29 bacterial and archaeal phyla we identified, the most abundant phyla were Cyanobacteria, Proteobacteria, and Bacteroidetes, respectively. Bacteroidetes increased in abundance for those samples collected during the peak of summer (February 2020), and were lowest in abundances during winter (July 2020).

## Joining with `by = join_by(OTU)`
## Joining with `by = join_by(Sample_ID)`

## [1] 1
Mean relative abundances at Phylum level, across all samples. We observe that 47.27% of the reads cannot be annotated bellow the Phylum level.
Group.1 x
Unknown Phylum 0.4727718
Cyanobacteria 0.3668856
Proteobacteria 0.1317830
Bacteroidetes 0.0126634
Actinobacteria 0.0079718
Planctomycetes 0.0024819
Firmicutes 0.0020133
Verrucomicrobia 0.0010665
Balneolaeota 0.0009227
Thaumarchaeota 0.0003119
Spirochaetes 0.0002799
Euryarchaeota 0.0002221
Candidatus Thermoplasmatota 0.0001400
Fusobacteria 0.0000869
Rhodothermaeota 0.0000807
Lentisphaerae 0.0000689
Tenericutes 0.0000574
Nitrospinae 0.0000417
Candidatus Tectomicrobia 0.0000289
Acidobacteria 0.0000283
Nitrospirae 0.0000279
Chloroflexi 0.0000222
Deinococcus-Thermus 0.0000116
Candidatus Peregrinibacteria 0.0000088
Chlamydiae 0.0000069
Gemmatimonadetes 0.0000057
Candidatus Kaiserbacteria 0.0000054
Candidatus Marinimicrobia 0.0000026
Fibrobacteres 0.0000018
## [1] 4
Mean relative abundances at Phylum level, partitioned per trip.
Group.1 Trip_01_Nov-Dec_2019 Trip_02_January_2020 Trip_03_February_2020 Trip_04_July_2020
Acidobacteria 0.0000001 0.0000393 0.0000321 0.0000382
Actinobacteria 0.0093240 0.0104277 0.0067635 0.0057322
Bacteroidetes 0.0114547 0.0091737 0.0265800 0.0059184
Balneolaeota 0.0014112 0.0009432 0.0011873 0.0003183
Candidatus Kaiserbacteria 0.0000001 0.0000210 0.0000002 0.0000002
Candidatus Marinimicrobia 0.0000108 0.0000001 0.0000002 0.0000002
Candidatus Peregrinibacteria 0.0000085 0.0000195 0.0000079 0.0000006
Candidatus Tectomicrobia 0.0000001 0.0000246 0.0000041 0.0000741
Candidatus Thermoplasmatota 0.0000147 0.0001967 0.0000164 0.0002847
Chlamydiae 0.0000002 0.0000264 0.0000003 0.0000004
Chloroflexi 0.0000001 0.0000265 0.0000250 0.0000335
Cyanobacteria 0.3425560 0.3769217 0.3654358 0.3785127
Deinococcus-Thermus 0.0000121 0.0000344 0.0000003 0.0000004
Euryarchaeota 0.0001920 0.0002252 0.0002329 0.0002349
Fibrobacteres 0.0000001 0.0000065 0.0000002 0.0000002
Firmicutes 0.0019127 0.0020475 0.0018850 0.0021616
Fusobacteria 0.0001014 0.0001372 0.0000967 0.0000250
Gemmatimonadetes 0.0000002 0.0000107 0.0000128 0.0000004
Lentisphaerae 0.0000604 0.0000589 0.0001671 0.0000087
Nitrospinae 0.0000137 0.0000248 0.0000499 0.0000720
Nitrospirae 0.0000144 0.0000409 0.0000229 0.0000311
Planctomycetes 0.0023251 0.0027011 0.0021333 0.0026849
Proteobacteria 0.1309415 0.1327437 0.1313794 0.1319308
Rhodothermaeota 0.0000642 0.0001129 0.0001349 0.0000244
Spirochaetes 0.0001831 0.0002533 0.0002024 0.0004384
Tenericutes 0.0000861 0.0000005 0.0001653 0.0000008
Thaumarchaeota 0.0002969 0.0003423 0.0002543 0.0003419
Unknown Phylum 0.4978213 0.4623094 0.4622533 0.4701345
Verrucomicrobia 0.0011939 0.0011304 0.0009563 0.0009964
go.anosim <- left_join(otu_table(megan_GO_5_RA_no_rare) %>%
                         as.data.frame %>%
                         rownames_to_column("OTU"),
                         megan_GO_5_FUN %>% 
                         rownames_to_column("OTU")) %>% 
  unite(taxonomy, c(OTU, Rank1, Rank2, Rank3, Rank4, Rank5, Rank6#, Rank7, Rank8
                    ), sep = "; ") %>% 
  column_to_rownames("taxonomy")
# Removing rows with NAs, because ANOSIM does not take in missing vals
go.anosim <- na.omit(go.anosim)

# Object is ready to perform the test
ano_go <- anosim(t(go.anosim), 
                   sample_data(megan_GO_5_RA_no_rare)$Sampling_trip, 
                   distance = "bray", 
                   permutations = 9999)
# Results
ano_go
## 
## Call:
## anosim(x = t(go.anosim), grouping = sample_data(megan_GO_5_RA_no_rare)$Sampling_trip,      permutations = 9999, distance = "bray") 
## Dissimilarity: bray 
## 
## ANOSIM statistic R: 0.3743 
##       Significance: 1e-04 
## 
## Permutation: free
## Number of permutations: 9999

Pairwise PERMANOVA - GO terms (rank 5)

Integrating microbial and environmental data

Partial Mantel tests

# Compute the mantel tests - cite the source of where this is coming from!
multimantel<-function(distance,env.df,geo.dist){
  BCdist<-distance
  statistic<-NULL
  pval<-NULL
  n.obs<-NULL
  for (i in 1:ncol(env.df)){
    na.pos<-which(is.na(env.df[,i]))
    if (length(na.pos)>0) tmp<-mantel.partial(as.dist(as.matrix(BCdist)[-c(na.pos),-c(na.pos)]),dist(env.df[-c(na.pos),i]),as.dist(as.matrix(geo.dist)[-c(na.pos),-c(na.pos)]),method = "pearson",permutations = 1000) else tmp<-mantel.partial(BCdist,dist(env.df[,i]),geo.dist,method = "pearson",permutations = 1000)
    statistic<-c(statistic,tmp$statistic)
    pval<-c(pval,tmp$signif)
    n.obs<-c(n.obs,nrow(env.df)-length(na.pos))
  }
  data.frame(var=colnames(env.df),statistic,pval,p.corr=p.adjust(pval,method="bonferroni"),n.obs)
}

### Calculate Bray-Curtis dissimilarities - doing this on the Relative abundance data when rare taxa were excluded  
# Taxonomy
megan_genus_dist <- vegdist(t(otu_table(megan_genus_RA_no_rare)), method = "bray")
# GO terms 
megan_go_dist <- vegdist(t(otu_table(megan_GO_5_RA_no_rare)), method = "bray")

# Getting distances (in km) for IMOS-MGD sites - this is important because the Mantels will be corrected for geography
# Getting distances (in km) for IMOS-MGD sites
metadata_Mantel <- sample_data(megan_genus_clr) %>% 
  as.matrix() %>% 
  as.data.frame() %>% 
  rownames_to_column("Sample_ID")

# Importing the coordinates
map_coords_Mantel <- read.csv("/home/markoterzin/Documents/PhD/IMOS_reporting/IMOS_MGD_metadata/MARKO_for_eReefs_Lats_Longs.csv")
map_coords_Mantel <- left_join(metadata_Mantel[,c(1,2)], map_coords_Mantel, by = c("REEF_NAME" = "name"))
# map_coords <- map_coords %>% remove_rownames %>% column_to_rownames(var="Sample_ID")
map_coords_Mantel$REEF_NAME <- NULL
names(map_coords_Mantel)[names(map_coords_Mantel) == 'Sample_ID'] <- 'name'
# Setting first column as row names
map_coords_Mantel <- map_coords_Mantel %>%
  remove_rownames %>%
  column_to_rownames(var="name")
# Need to reorder as pointDistance() function requires longitude to go first
map_coords_reorder <- map_coords_Mantel %>% 
  relocate(lon, lat)

# Probably better to compute this 4 times for each of the trips, but first need to make sure that this code works
IMOS_mar.dist.mat <- round(pointDistance(map_coords_reorder, lonlat=TRUE) / 1000)
rownames(IMOS_mar.dist.mat) <- metadata_Mantel$Sample_ID
colnames(IMOS_mar.dist.mat) <- metadata_Mantel$Sample_ID

# Trick here: Now adding one column to the front so that I can make the correlation plot for both midshelf and offshore reefs
metadata_Mantel <- cbind(a = 0, metadata_Mantel)

# partial Mantels - microbial taxa
partial_Mantel_taxa_res <- multimantel(as.dist(as.matrix(megan_genus_dist)[metadata_Mantel$a=="0",metadata_Mantel$a=="0"]), # Distance object, doing it only
                                 # for the epipelagic layer
                                 metadata_Mantel[metadata_Mantel$a=="0", colnames(metadata_Mantel[,c(26:42)])], # columns 26-42 will extract numerical values
                                 as.dist(as.matrix(IMOS_mar.dist.mat)[metadata_Mantel$a=="0", metadata_Mantel$a=="0"])) #[env.mat$epi=="EPI", env.mat$epi=="EPI"])) # I only need the geographic 
# distances, in km
knitr::kable(partial_Mantel_taxa_res %>% arrange(abs(statistic)),
             caption = "Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the taxonomic level."
               )
Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the taxonomic level.
var statistic pval p.corr n.obs
median_Si_µM -0.0002110 0.4845155 1.0000000 191
median_DOC_µM -0.0028420 0.5484515 1.0000000 191
median_NO3_µM -0.0047510 0.5024975 1.0000000 191
SALINITY_2.5m_RV -0.0050375 0.5484515 1.0000000 191
FLUORESCENCE_2.5m_RV 0.0124093 0.3266733 1.0000000 191
median_Chlorophyll_A_µg_L 0.0340392 0.1928072 1.0000000 191
median_TDN_µM -0.0352454 0.8861139 1.0000000 191
median_TSS_mg_L -0.0376407 0.8111888 1.0000000 187
median_Phaeophytin_A_µg_L 0.0463218 0.1008991 1.0000000 191
median_NO2_µM 0.0769044 0.0039960 0.0679321 191
median_NH4_µM 0.0867594 0.0139860 0.2377622 191
median_PP_µM 0.1486216 0.0009990 0.0169830 187
median_TDP_µM 0.1989444 0.0009990 0.0169830 191
median_POC_µM 0.2200806 0.0009990 0.0169830 191
median_PN_µM 0.2362877 0.0009990 0.0169830 191
SEAWATER_TEMPERATURE_2.5m_RV 0.2674913 0.0009990 0.0169830 191
median_PO4_µM 0.2823732 0.0009990 0.0169830 191
# partial Mantels - microbial function (GO terms)
partial_Mantel_GOs_res <- multimantel(as.dist(as.matrix(megan_go_dist)[metadata_Mantel$a=="0",metadata_Mantel$a=="0"]), # Distance object, doing it only
                                 # for the epipelagic layer
                                 metadata_Mantel[metadata_Mantel$a=="0", colnames(metadata_Mantel[,c(26:42)])], # columns 26-42 will extract numerical values
                                 as.dist(as.matrix(IMOS_mar.dist.mat)[metadata_Mantel$a=="0", metadata_Mantel$a=="0"])) #[env.mat$epi=="EPI", env.mat$epi=="EPI"])) # I only need the geographic 
# distances, in km
knitr::kable(partial_Mantel_GOs_res %>% arrange(abs(statistic)),
             caption = "Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the functional level."
               )
Partial Mantel tests assessing which physico-chemical parameters mat act as significant drivers of seawater microbiomes at the functional level.
var statistic pval p.corr n.obs
median_Si_µM 0.0111393 0.3076923 1.0000000 191
SALINITY_2.5m_RV -0.0130414 0.6993007 1.0000000 191
median_TSS_mg_L -0.0242762 0.7142857 1.0000000 187
median_TDN_µM 0.0293237 0.1438561 1.0000000 191
median_NO2_µM 0.0304584 0.1388611 1.0000000 191
median_NO3_µM -0.0370669 0.8851149 1.0000000 191
median_NH4_µM 0.0511959 0.0459540 0.7812188 191
FLUORESCENCE_2.5m_RV 0.0532699 0.0279720 0.4755245 191
median_Phaeophytin_A_µg_L 0.0930029 0.0069930 0.1188811 191
median_Chlorophyll_A_µg_L 0.1126347 0.0019980 0.0339660 191
median_DOC_µM 0.1157628 0.0009990 0.0169830 191
median_PP_µM 0.1189701 0.0009990 0.0169830 187
median_PN_µM 0.2554285 0.0009990 0.0169830 191
median_TDP_µM 0.2583475 0.0009990 0.0169830 191
median_PO4_µM 0.2640840 0.0009990 0.0169830 191
median_POC_µM 0.2755923 0.0009990 0.0169830 191
SEAWATER_TEMPERATURE_2.5m_RV 0.2990635 0.0009990 0.0169830 191
# WQ
partial_Mantel_cor.mat_taxa_WQ <- data.frame(Taxonomy=partial_Mantel_taxa_res$statistic,
#                            GO_terms=IMOS_res_go_WQ$statistic, # Transcriptome=res.metaT$statistic,
                            row.names = partial_Mantel_taxa_res$var)

partial_Mantel_pcor.mat_taxa_WQ <- data.frame(Taxonomy=partial_Mantel_taxa_res$pval,
#                             GO_terms=IMOS_res_go_WQ$p.corr, # Expression=res.exp$pval,
                             row.names = partial_Mantel_taxa_res$var)# ,Transcriptome=res.metaT$pval)
# Ordering - highest correlations first
# WQ_ordre<-order(apply(IMOS_cor.mat_WQ[,1:2],1,mean),decreasing = T)

# WQ
partial_Mantel_cor.mat_GOs_WQ <- data.frame(Functions=partial_Mantel_GOs_res$statistic,
                            #                            GO_terms=IMOS_res_go_WQ$statistic, # Transcriptome=res.metaT$statistic,
                            row.names = partial_Mantel_GOs_res$var)

partial_Mantel_pcor.mat_GOs_WQ <- data.frame(Functions=partial_Mantel_GOs_res$pval,
                             #                             GO_terms=IMOS_res_go_WQ$p.corr, # Expression=res.exp$pval,
                             row.names = partial_Mantel_GOs_res$var)# ,Transcriptome=res.metaT$pval)
# Ordering - highest correlations first
# WQ_ordre<-order(apply(IMOS_cor.mat_WQ[,1:2],1,mean),decreasing = T)

# Let's visualise this! as heatmaps:
# Taxonomy
heatmap_partial_Mantels_taxa_WQ <- ggcorrplot(partial_Mantel_cor.mat_taxa_WQ,#[ordre,], # Strongest drivers first
              p.mat=partial_Mantel_pcor.mat_taxa_WQ,#[ordre,], # Strongest drivers first
              insig = "blank",
              sig.level = 0.05,
              method = "square",
              lab=T,
              lab_size = 2.5,
              colors=c("#2874b2","white","#ba2832"))
heatmap_partial_Mantels_taxa_WQ

# Functions
heatmap_partial_Mantels_GOs_WQ <- ggcorrplot(partial_Mantel_cor.mat_GOs_WQ,#[ordre,], # Strongest drivers first
              p.mat=partial_Mantel_pcor.mat_GOs_WQ,#[ordre,], # Strongest drivers first
              insig = "blank",
              sig.level = 0.05,
              method = "square",
              lab=T,
              lab_size = 2.5,
              colors=c("#2874b2","white","#ba2832"))
heatmap_partial_Mantels_GOs_WQ

# Merging the two
# patchwork::wrap_plots(heatmap_partial_Mantels_taxa_WQ,
#                      heatmap_partial_Mantels_GOs_WQ,
#                      nrow = 2,
#                      ncol = 1)

MINT sPLS